Hands-on Exercise 8: Calibrating Hedonic Pricing Model for Private Highrise Property with Geographically weighted regression (GWR)

Author

William

Published

October 17, 2024

Modified

October 18, 2024

In this exercise, I am applying GWR to develop hedonic pricing models, where structural and locational variables are used to model 2015 resale condo prices.

R Packages used

  • OLS and performing diagnostics tests

  • Calibrating geographical weighted family of models

  • Multivariate data visualisation and analysis

  • Spatial data handling

    • sf
  • Attribute data handling

    • tidyverse, especially readr, ggplot2 and dplyr
  • Choropleth mapping

    • tmap
pacman::p_load(olsrr, corrplot, ggpubr, sf, spdep, GWmodel, tmap, tidyverse, gtsummary)

GW models suit situations when data are not described well by some global model, but where there are spatial regions where a suitably localised calibration provides a better description.

GWmodel includes functions to calibrate: GW summary statistics, GW principal components analysis, GW discriminant analysis and various forms of GW regression; some of which are provided in basic and robust (outlier resistant) forms.

Discriminant analysis

used to analyze data when the dependent variable is categorical and the independent variable is interval in nature

Data

URA Master Plan subzone boundary

mpsz <- st_read(dsn = 'data/geospatial/', layer = 'MP14_SUBZONE_WEB_PL')

Resale prices of condominium in 2015

condo_resale <- read_csv('data/aspatial/Condo_resale_2015.csv')

Wrangling

URA Master Plan subzone boundary

Since mpsz does not have EPSG information, use st_tranform() to modify the projection of mpsz

mpsz_3414 <- st_transform(mpsz, 3414)

# to verify
st_crs(mpsz)

Reveal the extent of mpsz_3414

# Returns bounding of a simple feature or simple feature set
st_bbox(mpsz_3414)

Resale prices of condominium in 2015

summary(condo_resale)
condo_resale_sf <- condo_resale %>% st_as_sf(coords = c('LONGITUDE','LATITUDE'), crs=4326) %>%
    st_transform(crs=3414) 
condo_resale_sf

EDA using ggplot2

Univariate

SELLING_PRICE

Code
ggplot(data=condo_resale_sf, aes(x=`SELLING_PRICE`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

Since it has a right-skewed distribution, normalise using logarithmic transformation.

LOG_SELLING_PRICE

condo_resale_sf <- condo_resale_sf %>% 
    mutate(`LOG_SELLING_PRICE` = log(SELLING_PRICE))

SELLING_PRICE vs LOG_SELLING_PRICE

Code
ggplot(data=condo_resale_sf, aes(x=`SELLING_PRICE`)) +
    geom_histogram(bins=20, color="black", fill="light blue")
ggplot(data=condo_resale_sf, aes(x=`LOG_SELLING_PRICE`)) +
    geom_histogram(bins=20, color="black", fill="light blue")

Multivariate (Trellis)

Code
AREA_SQM <- ggplot(data=condo_resale_sf, aes(x= `AREA_SQM`)) + 
  geom_histogram(bins=20, color="black", fill="light blue")

AGE <- ggplot(data=condo_resale_sf, aes(x= `AGE`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_CBD <- ggplot(data=condo_resale_sf, aes(x= `PROX_CBD`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_CHILDCARE <- ggplot(data=condo_resale_sf, aes(x= `PROX_CHILDCARE`)) + 
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_ELDERLYCARE <- ggplot(data=condo_resale_sf, aes(x= `PROX_ELDERLYCARE`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_URA_GROWTH_AREA <- ggplot(data=condo_resale_sf, 
                               aes(x= `PROX_URA_GROWTH_AREA`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_HAWKER_MARKET <- ggplot(data=condo_resale_sf, aes(x= `PROX_HAWKER_MARKET`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_KINDERGARTEN <- ggplot(data=condo_resale_sf, aes(x= `PROX_KINDERGARTEN`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_MRT <- ggplot(data=condo_resale_sf, aes(x= `PROX_MRT`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_PARK <- ggplot(data=condo_resale_sf, aes(x= `PROX_PARK`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_PRIMARY_SCH <- ggplot(data=condo_resale_sf, aes(x= `PROX_PRIMARY_SCH`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

PROX_TOP_PRIMARY_SCH <- ggplot(data=condo_resale_sf, 
                               aes(x= `PROX_TOP_PRIMARY_SCH`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

ggarrange(AREA_SQM, AGE, PROX_CBD, PROX_CHILDCARE, PROX_ELDERLYCARE, 
          PROX_URA_GROWTH_AREA, PROX_HAWKER_MARKET, PROX_KINDERGARTEN, PROX_MRT,
          PROX_PARK, PROX_PRIMARY_SCH, PROX_TOP_PRIMARY_SCH,  
          ncol = 3, nrow = 4)

Statistical Point Map

tmap_mode("view")
Code
tmap_options(check.and.fix = TRUE)
tm_shape(mpsz_3414)+
  tm_polygons() +
tm_shape(condo_resale_sf) +  
  tm_dots(col = "SELLING_PRICE",
          alpha = 0.6,
          style="quantile") +
  tm_view(set.zoom.limits = c(11,14))
tmap_mode("plot")

Hedonic Pricing Models in R using lm()

Simple Linear Regression

Obtain and print a summary analysis

Code
condo_slr <- lm(
    SELLING_PRICE ~ AREA_SQM, 
    condo_resale_sf)
summary(condo_slr)

Call:
lm(formula = SELLING_PRICE ~ AREA_SQM, data = condo_resale_sf)

Residuals:
     Min       1Q   Median       3Q      Max 
-3695815  -391764   -87517   258900 13503875 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -258121.1    63517.2  -4.064 5.09e-05 ***
AREA_SQM      14719.0      428.1  34.381  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 942700 on 1434 degrees of freedom
Multiple R-squared:  0.4518,    Adjusted R-squared:  0.4515 
F-statistic:  1182 on 1 and 1434 DF,  p-value: < 2.2e-16
  • SESELLING_PRICE = 14719*AREA_SQM - 258121.1

  • R-squared = 0.4518 (model is able to explain ~45% of actual condonresale prices

  • Model p-values <<< 0.001 (reject H0 that mean is a good estimator of SELLING_PRICE, hence there is sufficient statistic evidence that simple linear regression model above is a good estimator of SELLING_PRICE.

  • Both coefficient p-values <<< 0.001 (reject H0 … and can infer that both coefficients are good parameter estimates)

Visualise best fit curve on a scatterplot

Code
# using lm() as a method function
ggplot(data=condo_resale_sf,  
       aes(x=`AREA_SQM`, y=`SELLING_PRICE`)) +
  geom_point() +
  geom_smooth(method = lm)

There are a few statistical outliers with relatively high selling prices.

Multiple Linear Regression

Visualising the relationships of the independent variables (to identify highly correlated variables)

Code
# corrplot
corrplot(cor(
    condo_resale[,5:23]),
    diag = FALSE,      # correlation coefficients are hidden
    order = 'AOE',
    tl.pos = 'td',     # top-diagonal text labels
    tl.cex = .5,       # size of text labels
    method = 'number', # visualisation method
    type = 'upper'     # displays upper half triangle of correlation matrix
)

Freehold is highly correlated to LEASE_99YEAR. In view of this, it is wiser to only include either one of them in the subsequent model building. As a result, LEASE_99YEAR is excluded in the subsequent model building

Building the model

Code
# using lm()
condo_mlr <- lm(formula = SELLING_PRICE ~ AREA_SQM + AGE    + 
                  PROX_CBD + PROX_CHILDCARE + PROX_ELDERLYCARE +
                  PROX_URA_GROWTH_AREA + PROX_HAWKER_MARKET + PROX_KINDERGARTEN + 
                  PROX_MRT  + PROX_PARK + PROX_PRIMARY_SCH + 
                  PROX_TOP_PRIMARY_SCH + PROX_SHOPPING_MALL + PROX_SUPERMARKET + 
                  PROX_BUS_STOP + NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD, 
                data=condo_resale_sf)
summary(condo_mlr)

Call:
lm(formula = SELLING_PRICE ~ AREA_SQM + AGE + PROX_CBD + PROX_CHILDCARE + 
    PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + PROX_HAWKER_MARKET + 
    PROX_KINDERGARTEN + PROX_MRT + PROX_PARK + PROX_PRIMARY_SCH + 
    PROX_TOP_PRIMARY_SCH + PROX_SHOPPING_MALL + PROX_SUPERMARKET + 
    PROX_BUS_STOP + NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD, 
    data = condo_resale_sf)

Residuals:
     Min       1Q   Median       3Q      Max 
-3475964  -293923   -23069   241043 12260381 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           481728.40  121441.01   3.967 7.65e-05 ***
AREA_SQM               12708.32     369.59  34.385  < 2e-16 ***
AGE                   -24440.82    2763.16  -8.845  < 2e-16 ***
PROX_CBD              -78669.78    6768.97 -11.622  < 2e-16 ***
PROX_CHILDCARE       -351617.91  109467.25  -3.212  0.00135 ** 
PROX_ELDERLYCARE      171029.42   42110.51   4.061 5.14e-05 ***
PROX_URA_GROWTH_AREA   38474.53   12523.57   3.072  0.00217 ** 
PROX_HAWKER_MARKET     23746.10   29299.76   0.810  0.41782    
PROX_KINDERGARTEN     147468.99   82668.87   1.784  0.07466 .  
PROX_MRT             -314599.68   57947.44  -5.429 6.66e-08 ***
PROX_PARK             563280.50   66551.68   8.464  < 2e-16 ***
PROX_PRIMARY_SCH      180186.08   65237.95   2.762  0.00582 ** 
PROX_TOP_PRIMARY_SCH    2280.04   20410.43   0.112  0.91107    
PROX_SHOPPING_MALL   -206604.06   42840.60  -4.823 1.57e-06 ***
PROX_SUPERMARKET      -44991.80   77082.64  -0.584  0.55953    
PROX_BUS_STOP         683121.35  138353.28   4.938 8.85e-07 ***
NO_Of_UNITS             -231.18      89.03  -2.597  0.00951 ** 
FAMILY_FRIENDLY       140340.77   47020.55   2.985  0.00289 ** 
FREEHOLD              359913.01   49220.22   7.312 4.38e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 755800 on 1417 degrees of freedom
Multiple R-squared:  0.6518,    Adjusted R-squared:  0.6474 
F-statistic: 147.4 on 18 and 1417 DF,  p-value: < 2.2e-16

Revising the Model

We still have to improve the model by removing statistically insignificant variables

Tests and packages used:

  • olsrr

    • ols_regress() - model summary

    • ols_vif_tol() - check multicolinearity

    • ols_plot_resid_fit() - check linearity

    • ols_plot_resid_hist() or ols_test_normality() - check normality

    • check autocorrelation

  • tbl_regression() from gtsummary

[Remove] statistically insignificant variables

condo_mlr1 <- lm(formula = SELLING_PRICE ~ AREA_SQM + AGE + 
                   PROX_CBD + PROX_CHILDCARE + PROX_ELDERLYCARE +
                   PROX_URA_GROWTH_AREA + PROX_MRT  + PROX_PARK + 
                   PROX_PRIMARY_SCH + PROX_SHOPPING_MALL + PROX_BUS_STOP + 
                   NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD,
                data=condo_resale_sf)

[VIF] ols_vif_tol() - check multicolinearity

Code
ols_vif_tol(condo_mlr1)
              Variables Tolerance      VIF
1              AREA_SQM 0.8728554 1.145665
2                   AGE 0.7071275 1.414172
3              PROX_CBD 0.6356147 1.573280
4        PROX_CHILDCARE 0.3066019 3.261559
5      PROX_ELDERLYCARE 0.6598479 1.515501
6  PROX_URA_GROWTH_AREA 0.7510311 1.331503
7              PROX_MRT 0.5236090 1.909822
8             PROX_PARK 0.8279261 1.207837
9      PROX_PRIMARY_SCH 0.4524628 2.210126
10   PROX_SHOPPING_MALL 0.6738795 1.483945
11        PROX_BUS_STOP 0.3514118 2.845664
12          NO_Of_UNITS 0.6901036 1.449058
13      FAMILY_FRIENDLY 0.7244157 1.380423
14             FREEHOLD 0.6931163 1.442759

VIF < 10: no sign of multi-colinearity

[Residual] ols_plot_resid_fit() - check linearity

Code
ols_plot_resid_fit(condo_mlr1)

Interpretation

Most of the data poitns are scattered around the 0 line, hence we can safely conclude that the relationships between the dependent variable and independent variables are linear.

[Residual] ols_plot_resid_hist() or

{#| output: true} #| code-fold: true{r} ols_plot_resid_hist(condo_mlr1)

Interpretation

Residuals appear normally distributed

[Stat. test] ols_test_normality() - check normality

Code
ols_test_normality(condo_mlr1)
-----------------------------------------------
       Test             Statistic       pvalue  
-----------------------------------------------
Shapiro-Wilk              0.6856         0.0000 
Kolmogorov-Smirnov        0.1366         0.0000 
Cramer-von Mises         121.0768        0.0000 
Anderson-Darling         67.9551         0.0000 
-----------------------------------------------
Interpretation

p-values << 0.05(significance level). Hence we will reject the null H0 and infer that there is statistical evidence that the residual are not normally distributed

Spatial autocorrelation test

Code
# save residual of the hedonic pricing model as a data frame
mlr_output <- as.data.frame(condo_mlr1$residuals)

# join with condo_resale_sf
condo_resale_res_sf <- cbind(condo_resale_sf, mlr_output) %>% 
    mutate(MLR_RES = condo_mlr1$residuals)
# convert condo_resale_res_sf into a SpatialPointsDataFrame
condo_resale_sp <- as_Spatial(condo_resale_res_sf)
condo_resale_sp
class       : SpatialPointsDataFrame 
features    : 1436 
extent      : 14940.85, 43352.45, 24765.67, 48382.81  (xmin, xmax, ymin, ymax)
crs         : +proj=tmerc +lat_0=1.36666666666667 +lon_0=103.833333333333 +k=1 +x_0=28001.642 +y_0=38744.572 +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs 
variables   : 24
names       : POSTCODE, SELLING_PRICE, AREA_SQM, AGE,    PROX_CBD, PROX_CHILDCARE, PROX_ELDERLYCARE, PROX_URA_GROWTH_AREA, PROX_HAWKER_MARKET, PROX_KINDERGARTEN,    PROX_MRT,   PROX_PARK, PROX_PRIMARY_SCH, PROX_TOP_PRIMARY_SCH, PROX_SHOPPING_MALL, ... 
min values  :    18965,        540000,       34,   0, 0.386916393,    0.004927023,      0.054508623,          0.214539508,        0.051817113,       0.004927023, 0.052779424, 0.029064164,      0.077106132,          0.077106132,                  0, ... 
max values  :   828833,       1.8e+07,      619,  37, 19.18042832,     3.46572633,      3.949157205,           9.15540001,        5.374348075,       2.229045366,  3.48037319,  2.16104919,      3.928989144,          6.748192062,        3.477433767, ... 
tmap_mode("view")
Code
tm_shape(mpsz_3414) +
  tmap_options(check.and.fix = TRUE) +
  tm_polygons(alpha = 0.4) +
tm_shape(condo_resale_res_sf) +  
  tm_dots(col = "MLR_RES",
          alpha = 0.6,
          style="quantile") +
  tm_view(set.zoom.limits = c(11,14))
Interpretation

Signs of spatial autocorrelation. Now need to test for its significance

tmap_mode('plot')
Code
# Moran's I

# compute the distance-based weight matrix
nb <- dnearneigh(
    coordinates(condo_resale_sp),
    0,    # lower bound
    1500, # upper bound
    longlat = FALSE
)

# convert nb into spatial weights
nb_lw <- nb2listw(nb, style = 'W')

# perform Moran's I
lm.morantest(condo_mlr1, nb_lw)

    Global Moran I for regression residuals

data:  
model: lm(formula = SELLING_PRICE ~ AREA_SQM + AGE + PROX_CBD +
PROX_CHILDCARE + PROX_ELDERLYCARE + PROX_URA_GROWTH_AREA + PROX_MRT +
PROX_PARK + PROX_PRIMARY_SCH + PROX_SHOPPING_MALL + PROX_BUS_STOP +
NO_Of_UNITS + FAMILY_FRIENDLY + FREEHOLD, data = condo_resale_sf)
weights: nb_lw

Moran I statistic standard deviate = 24.366, p-value < 2.2e-16
alternative hypothesis: greater
sample estimates:
Observed Moran I      Expectation         Variance 
    1.438876e-01    -5.487594e-03     3.758259e-05 
Code
# summary(nb)
# summary(nb_lw)

Generate publication-ready summary tables

Code
ols_regress(condo_mlr1)
                                Model Summary                                 
-----------------------------------------------------------------------------
R                            0.807       RMSE                     751998.679 
R-Squared                    0.651       MSE                571471422208.592 
Adj. R-Squared               0.647       Coef. Var                    43.168 
Pred R-Squared               0.638       AIC                       42966.758 
MAE                     414819.628       SBC                       43051.072 
-----------------------------------------------------------------------------
 RMSE: Root Mean Square Error 
 MSE: Mean Square Error 
 MAE: Mean Absolute Error 
 AIC: Akaike Information Criteria 
 SBC: Schwarz Bayesian Criteria 

                                     ANOVA                                       
--------------------------------------------------------------------------------
                    Sum of                                                      
                   Squares          DF         Mean Square       F         Sig. 
--------------------------------------------------------------------------------
Regression    1.512586e+15          14        1.080418e+14    189.059    0.0000 
Residual      8.120609e+14        1421    571471422208.592                      
Total         2.324647e+15        1435                                          
--------------------------------------------------------------------------------

                                               Parameter Estimates                                                
-----------------------------------------------------------------------------------------------------------------
               model           Beta    Std. Error    Std. Beta       t        Sig           lower          upper 
-----------------------------------------------------------------------------------------------------------------
         (Intercept)     527633.222    108183.223                   4.877    0.000     315417.244     739849.200 
            AREA_SQM      12777.523       367.479        0.584     34.771    0.000      12056.663      13498.382 
                 AGE     -24687.739      2754.845       -0.167     -8.962    0.000     -30091.739     -19283.740 
            PROX_CBD     -77131.323      5763.125       -0.263    -13.384    0.000     -88436.469     -65826.176 
      PROX_CHILDCARE    -318472.751    107959.512       -0.084     -2.950    0.003    -530249.889    -106695.613 
    PROX_ELDERLYCARE     185575.623     39901.864        0.090      4.651    0.000     107302.737     263848.510 
PROX_URA_GROWTH_AREA      39163.254     11754.829        0.060      3.332    0.001      16104.571      62221.936 
            PROX_MRT    -294745.107     56916.367       -0.112     -5.179    0.000    -406394.234    -183095.980 
           PROX_PARK     570504.807     65507.029        0.150      8.709    0.000     442003.938     699005.677 
    PROX_PRIMARY_SCH     159856.136     60234.599        0.062      2.654    0.008      41697.849     278014.424 
  PROX_SHOPPING_MALL    -220947.251     36561.832       -0.115     -6.043    0.000    -292668.213    -149226.288 
       PROX_BUS_STOP     682482.221    134513.243        0.134      5.074    0.000     418616.359     946348.082 
         NO_Of_UNITS       -245.480        87.947       -0.053     -2.791    0.005       -418.000        -72.961 
     FAMILY_FRIENDLY     146307.576     46893.021        0.057      3.120    0.002      54320.593     238294.560 
            FREEHOLD     350599.812     48506.485        0.136      7.228    0.000     255447.802     445751.821 
-----------------------------------------------------------------------------------------------------------------

OR,

Code
tbl_regression(condo_mlr1, intercept = TRUE) %>% 
    
    # include model statistics as a table source note
    add_glance_source_note(
        label = list(sigma ~ "\U03C3"),
        include = c(
            r.squared, 
            adj.r.squared, 
            AIC, 
            statistic,
            p.value, 
            sigma))
Characteristic Beta 95% CI1 p-value
(Intercept) 527,633 315,417, 739,849 <0.001
AREA_SQM 12,778 12,057, 13,498 <0.001
AGE -24,688 -30,092, -19,284 <0.001
PROX_CBD -77,131 -88,436, -65,826 <0.001
PROX_CHILDCARE -318,473 -530,250, -106,696 0.003
PROX_ELDERLYCARE 185,576 107,303, 263,849 <0.001
PROX_URA_GROWTH_AREA 39,163 16,105, 62,222 <0.001
PROX_MRT -294,745 -406,394, -183,096 <0.001
PROX_PARK 570,505 442,004, 699,006 <0.001
PROX_PRIMARY_SCH 159,856 41,698, 278,014 0.008
PROX_SHOPPING_MALL -220,947 -292,668, -149,226 <0.001
PROX_BUS_STOP 682,482 418,616, 946,348 <0.001
NO_Of_UNITS -245 -418, -73 0.005
FAMILY_FRIENDLY 146,308 54,321, 238,295 0.002
FREEHOLD 350,600 255,448, 445,752 <0.001
R² = 0.651; Adjusted R² = 0.647; AIC = 42,967; Statistic = 189; p-value = <0.001; σ = 755,957
1 CI = Confidence Interval
Code
# More info
# https://www.danieldsjoberg.com/gtsummary/articles/tbl_regression.html

RDS

write_rds(mpsz_3414,'../ex11/data/rds/mpsz_3414.rds')
write_rds(condo_resale_sp,'../ex11/data/rds/condo_resale_sp.rds')